// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "Renderer.hpp"

#include "Clipper.hpp"
#include "Polygon.hpp"
#include "Primitive.hpp"
#include "Vertex.hpp"
#include "Pipeline/Constants.hpp"
#include "Pipeline/SpirvShader.hpp"
#include "Reactor/Reactor.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "System/Math.hpp"
#include "System/Memory.hpp"
#include "System/Timer.hpp"
#include "Vulkan/VkConfig.hpp"
#include "Vulkan/VkDescriptorSet.hpp"
#include "Vulkan/VkDevice.hpp"
#include "Vulkan/VkFence.hpp"
#include "Vulkan/VkImageView.hpp"
#include "Vulkan/VkPipelineLayout.hpp"
#include "Vulkan/VkQueryPool.hpp"

#include "marl/containers.h"
#include "marl/defer.h"
#include "marl/trace.h"

#undef max

#ifndef NDEBUG
// Debug-build-only globals: they are compiled out when NDEBUG is defined.
// Nothing in the visible part of this file reads them; presumably they are
// primitive-count bounds meant to be adjusted from a debugger -- TODO confirm.
unsigned int minPrimitives = 1;
unsigned int maxPrimitives = 1 << 21;
#endif

namespace sw {

// Expands the primitives [start, start + triangleCount) of a draw into explicit
// vertex indices.
//
// `indices` is anything that can be subscripted with an unsigned int (an index
// buffer pointer, or a functor for non-indexed draws). For every topology except
// point lists, primitive i fills the three-entry row batch[i], with the entries
// ordered according to the provoking vertex mode. Point lists are written
// contiguously, one index per point, followed by three copies of the last index
// as padding for SIMD width overrun.
//
// Returns true on success; asserts and returns false for an unhandled topology.
template<typename T>
inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, VkProvokingVertexModeEXT provokingVertexMode, T indices, unsigned int start, unsigned int triangleCount)
{
	const bool firstVertexProvokes = (provokingVertexMode == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT);

	switch(topology)
	{
	case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
		{
			// Points are compacted: the output is treated as a flat array.
			unsigned int *out = &(batch[0][0]);
			unsigned int source = start;
			for(unsigned int remaining = triangleCount; remaining > 0; remaining--)
			{
				*out++ = indices[source++];
			}

			// Repeat the last index to allow for SIMD width overrun.
			const unsigned int last = source - 1;
			for(unsigned int pad = 0; pad < 3; pad++)
			{
				out[pad] = indices[last];
			}
		}
		break;
	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
		{
			// Lists consume two indices per line, strips share one with the next line.
			const unsigned int stride = (topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST) ? 2 : 1;
			const unsigned int first = firstVertexProvokes ? 0 : 1;
			const unsigned int second = firstVertexProvokes ? 1 : 0;

			unsigned int base = stride * start;
			for(unsigned int line = 0; line < triangleCount; line++, base += stride)
			{
				unsigned int *row = batch[line];
				row[0] = indices[base + first];
				row[1] = indices[base + second];
				row[2] = indices[base + 1];
			}
		}
		break;
	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
		{
			const unsigned int offset0 = firstVertexProvokes ? 0 : 2;
			const unsigned int offset1 = firstVertexProvokes ? 1 : 0;
			const unsigned int offset2 = firstVertexProvokes ? 2 : 1;

			unsigned int base = 3 * start;
			for(unsigned int tri = 0; tri < triangleCount; tri++, base += 3)
			{
				unsigned int *row = batch[tri];
				row[0] = indices[base + offset0];
				row[1] = indices[base + offset1];
				row[2] = indices[base + offset2];
			}
		}
		break;
	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
		{
			const unsigned int provokingOffset = firstVertexProvokes ? 0 : 2;
			const unsigned int windingBase = firstVertexProvokes ? 1 : 0;

			unsigned int base = start;
			for(unsigned int tri = 0; tri < triangleCount; tri++, base++)
			{
				// The parity of the primitive's position in the strip selects
				// which of the two remaining vertices comes first.
				const unsigned int odd = (start + tri) & 1;
				const unsigned int even = odd ^ 1;

				unsigned int *row = batch[tri];
				row[0] = indices[base + provokingOffset];
				row[1] = indices[base + odd + windingBase];
				row[2] = indices[base + even + windingBase];
			}
		}
		break;
	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
		{
			const unsigned int slot0 = firstVertexProvokes ? 0 : 2;
			const unsigned int slot1 = firstVertexProvokes ? 1 : 0;
			const unsigned int slot2 = firstVertexProvokes ? 2 : 1;

			unsigned int base = start + 1;
			for(unsigned int tri = 0; tri < triangleCount; tri++, base++)
			{
				unsigned int *row = batch[tri];
				row[slot0] = indices[base + 0];
				row[slot1] = indices[base + 1];
				row[slot2] = indices[0];  // Every fan triangle shares the very first index.
			}
		}
		break;
	default:
		ASSERT(false);
		return false;
	}

	return true;
}

// Allocates the DrawData block for this draw call. The block is obtained from
// sw::allocateZeroOrPoison() and released again in the destructor.
DrawCall::DrawCall()
{
	// TODO(b/140991626): Use allocateUninitialized() instead of allocateZeroOrPoison() to improve startup performance.
	data = (DrawData *)sw::allocateZeroOrPoison(sizeof(DrawData));
}

// Releases the DrawData block allocated by the constructor.
DrawCall::~DrawCall()
{
	sw::freeMemory(data);
}

// Creates a renderer for `device` and gives the vertex, pixel and setup
// processors a routine cache of the same size.
Renderer::Renderer(vk::Device *device)
    : device(device)
{
	constexpr int routineCacheSize = 1024;

	vertexProcessor.setRoutineCacheSize(routineCacheSize);
	pixelProcessor.setRoutineCacheSize(routineCacheSize);
	setupProcessor.setRoutineCacheSize(routineCacheSize);
}

// Takes a ticket from drawTickets and waits on it before the renderer goes away.
// Every draw takes a ticket from this same queue in DrawCall::run(), so this
// presumably blocks until all outstanding draws have finished -- the ordering
// guarantee comes from marl::Ticket::Queue, confirm against its documentation.
Renderer::~Renderer()
{
	drawTickets.take().wait();
}

// Renderer objects must be allocated with the alignment given in the class
// declaration, so allocation goes through vk::allocateHostMemory() with
// alignof(Renderer) rather than through the global operator new.
void *Renderer::operator new(size_t size)
{
	ASSERT(size == sizeof(Renderer));  // This operator can't be called from a derived class
	return vk::allocateHostMemory(sizeof(Renderer), alignof(Renderer), vk::NULL_ALLOCATION_CALLBACKS, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
}

// Counterpart of Renderer::operator new: returns the memory through
// vk::freeHostMemory() using the same (null) allocation callbacks.
void Renderer::operator delete(void *mem)
{
	vk::freeHostMemory(mem, vk::NULL_ALLOCATION_CALLBACKS);
}

// Records one draw of `count` primitives into a pooled DrawCall and hands it to
// DrawCall::run() for execution.
//
// - Returns immediately when `count` is zero.
// - When `update` is true the vertex/setup/pixel states and routines are
//   recomputed from `pipeline`; otherwise the ones stored in this Renderer by an
//   earlier call are reused.
// - `indexBuffer` may be null, in which case the draw is treated as non-indexed
//   (see DrawCall::processPrimitiveVertices()).
// - `events`, if non-null, is stored on the draw call; DrawCall::setup() and
//   DrawCall::teardown() call add() and done() on it.
// - An unsupported polygon mode is reported via UNSUPPORTED() and the draw is
//   dropped.
void Renderer::draw(const vk::GraphicsPipeline *pipeline, const vk::DynamicState &dynamicState, unsigned int count, int baseVertex,
                    CountedEvent *events, int instanceID, int viewID, void *indexBuffer, const VkExtent3D &framebufferExtent,
                    vk::Pipeline::PushConstantStorage const &pushConstants, bool update)
{
	if(count == 0) { return; }

	auto id = nextDrawID++;
	MARL_SCOPED_EVENT("draw %d", id);

	// Borrow a DrawCall object from the pool; the borrow is traced as its own event.
	marl::Pool<sw::DrawCall>::Loan draw;
	{
		MARL_SCOPED_EVENT("drawCallPool.borrow()");
		draw = drawCallPool.borrow();
	}
	draw->id = id;

	const vk::GraphicsState &pipelineState = pipeline->getState(dynamicState);
	pixelProcessor.setBlendConstant(pipelineState.getBlendConstants());

	const vk::Inputs &inputs = pipeline->getInputs();

	if(update)
	{
		MARL_SCOPED_EVENT("update");

		const sw::SpirvShader *fragmentShader = pipeline->getShader(VK_SHADER_STAGE_FRAGMENT_BIT).get();
		const sw::SpirvShader *vertexShader = pipeline->getShader(VK_SHADER_STAGE_VERTEX_BIT).get();

		const vk::Attachments attachments = pipeline->getAttachments();

		// Derive the per-stage states first, then obtain the routine for each state.
		vertexState = vertexProcessor.update(pipelineState, vertexShader, inputs);
		setupState = setupProcessor.update(pipelineState, fragmentShader, vertexShader, attachments);
		pixelState = pixelProcessor.update(pipelineState, fragmentShader, vertexShader, attachments, hasOcclusionQuery());

		vertexRoutine = vertexProcessor.routine(vertexState, pipelineState.getPipelineLayout(), vertexShader, inputs.getDescriptorSets());
		setupRoutine = setupProcessor.routine(setupState);
		pixelRoutine = pixelProcessor.routine(pixelState, pipelineState.getPipelineLayout(), fragmentShader, inputs.getDescriptorSets());
	}

	draw->containsImageWrite = pipeline->containsImageWrite();

	// Choose the primitive setup function and the batch size. The setup functions
	// advance the primitive output by the sample count for every visible primitive,
	// and the wireframe/point polygon modes emit up to three primitives per input
	// triangle, so the number of input primitives per batch is divided accordingly.
	DrawCall::SetupFunction setupPrimitives = nullptr;
	int ms = pipelineState.getSampleCount();
	unsigned int numPrimitivesPerBatch = MaxBatchSize / ms;

	if(pipelineState.isDrawTriangle(false))
	{
		switch(pipelineState.getPolygonMode())
		{
		case VK_POLYGON_MODE_FILL:
			setupPrimitives = &DrawCall::setupSolidTriangles;
			break;
		case VK_POLYGON_MODE_LINE:
			setupPrimitives = &DrawCall::setupWireframeTriangles;
			numPrimitivesPerBatch /= 3;
			break;
		case VK_POLYGON_MODE_POINT:
			setupPrimitives = &DrawCall::setupPointTriangles;
			numPrimitivesPerBatch /= 3;
			break;
		default:
			UNSUPPORTED("polygon mode: %d", int(pipelineState.getPolygonMode()));
			return;
		}
	}
	else if(pipelineState.isDrawLine(false))
	{
		setupPrimitives = &DrawCall::setupLines;
	}
	else  // Point primitive topology
	{
		setupPrimitives = &DrawCall::setupPoints;
	}

	// Copy everything the asynchronous tasks need into the draw call itself.
	DrawData *data = draw->data;
	draw->occlusionQuery = occlusionQuery;
	draw->batchDataPool = &batchDataPool;
	draw->numPrimitives = count;
	draw->numPrimitivesPerBatch = numPrimitivesPerBatch;
	// Round up so that a partially filled last batch is still processed.
	draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
	draw->topology = pipelineState.getTopology();
	draw->provokingVertexMode = pipelineState.getProvokingVertexMode();
	draw->indexType = pipeline->getIndexBuffer().getIndexType();
	draw->lineRasterizationMode = pipelineState.getLineRasterizationMode();
	draw->descriptorSetObjects = inputs.getDescriptorSetObjects();
	draw->pipelineLayout = pipelineState.getPipelineLayout();
	draw->depthClipEnable = pipelineState.getDepthClipEnable();

	draw->vertexRoutine = vertexRoutine;
	draw->setupRoutine = setupRoutine;
	draw->pixelRoutine = pixelRoutine;
	draw->setupPrimitives = setupPrimitives;
	draw->setupState = setupState;

	data->descriptorSets = inputs.getDescriptorSets();
	data->descriptorDynamicOffsets = inputs.getDescriptorDynamicOffsets();

	// Vertex input streams: buffer pointer, robustness size and stride per stream.
	for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
	{
		const sw::Stream &stream = inputs.getStream(i);
		data->input[i] = stream.buffer;
		data->robustnessSize[i] = stream.robustnessSize;
		data->stride[i] = stream.vertexStride;
	}

	data->indices = indexBuffer;
	data->viewID = viewID;
	data->instanceID = instanceID;
	data->baseVertex = baseVertex;

	// Index 0 receives the front-face stencil state, index 1 the back-face state.
	if(pixelState.stencilActive)
	{
		data->stencil[0].set(pipelineState.getFrontStencil().reference, pipelineState.getFrontStencil().compareMask, pipelineState.getFrontStencil().writeMask);
		data->stencil[1].set(pipelineState.getBackStencil().reference, pipelineState.getBackStencil().compareMask, pipelineState.getBackStencil().writeMask);
	}

	data->lineWidth = pipelineState.getLineWidth();

	data->factor = pixelProcessor.factor;

	// Alpha-to-coverage constants, one per sample, evenly spaced in (0, 1).
	// Only sample counts of 1, 2 and 4 are handled.
	if(pixelState.alphaToCoverage)
	{
		if(ms == 4)
		{
			data->a2c0 = float4(0.2f);
			data->a2c1 = float4(0.4f);
			data->a2c2 = float4(0.6f);
			data->a2c3 = float4(0.8f);
		}
		else if(ms == 2)
		{
			data->a2c0 = float4(0.25f);
			data->a2c1 = float4(0.75f);
		}
		else if(ms == 1)
		{
			data->a2c0 = float4(0.5f);
		}
		else
			ASSERT(false);
	}

	// Reset the per-cluster occlusion counters; DrawCall::teardown() adds them
	// to the occlusion query.
	if(pixelState.occlusionEnabled)
	{
		for(int cluster = 0; cluster < MaxClusterCount; cluster++)
		{
			data->occlusion[cluster] = 0;
		}
	}

	// Viewport
	{
		const VkViewport &viewport = pipelineState.getViewport();

		// W and H are the half extents of the viewport, (X0, Y0) is its center,
		// and N/F/Z are the near depth, far depth and depth range.
		float W = 0.5f * viewport.width;
		float H = 0.5f * viewport.height;
		float X0 = viewport.x + W;
		float Y0 = viewport.y + H;
		float N = viewport.minDepth;
		float F = viewport.maxDepth;
		float Z = F - N;
		constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;

		// The "xF" values are pre-multiplied by the subpixel precision factor;
		// the center is additionally shifted by half a pixel.
		data->WxF = float4(W * subPixF);
		data->HxF = float4(H * subPixF);
		data->X0xF = float4(X0 * subPixF - subPixF / 2);
		data->Y0xF = float4(Y0 * subPixF - subPixF / 2);
		data->halfPixelX = float4(0.5f / W);
		data->halfPixelY = float4(0.5f / H);
		data->viewportHeight = abs(viewport.height);
		data->depthRange = Z;
		data->depthNear = N;
		data->constantDepthBias = pipelineState.getConstantDepthBias();
		data->slopeDepthBias = pipelineState.getSlopeDepthBias();
		data->depthBiasClamp = pipelineState.getDepthBiasClamp();
		data->depthClipEnable = pipelineState.getDepthClipEnable();

		const vk::Attachments attachments = pipeline->getAttachments();
		if(attachments.depthBuffer)
		{
			switch(attachments.depthBuffer->getFormat(VK_IMAGE_ASPECT_DEPTH_BIT))
			{
			case VK_FORMAT_D16_UNORM:
				data->minimumResolvableDepthDifference = 1.0f / 0xFFFF;
				break;
			case VK_FORMAT_D32_SFLOAT:
				// The minimum resolvable depth difference is determined per-polygon for floating-point depth
				// buffers. DrawData::minimumResolvableDepthDifference is unused.
				break;
			default:
				UNSUPPORTED("Depth format: %d", int(attachments.depthBuffer->getFormat(VK_IMAGE_ASPECT_DEPTH_BIT)));
			}
		}
	}

	// Target
	{
		const vk::Attachments attachments = pipeline->getAttachments();

		// For every bound attachment, store the image view on the draw call and
		// its base pointer plus row/slice pitches in the draw data. The pointer
		// is taken for the layer selected by data->viewID.
		for(int index = 0; index < MAX_COLOR_BUFFERS; index++)
		{
			draw->colorBuffer[index] = attachments.colorBuffer[index];

			if(draw->colorBuffer[index])
			{
				data->colorBuffer[index] = (unsigned int *)attachments.colorBuffer[index]->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_COLOR_BIT, 0, data->viewID);
				data->colorPitchB[index] = attachments.colorBuffer[index]->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
				data->colorSliceB[index] = attachments.colorBuffer[index]->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
			}
		}

		draw->depthBuffer = attachments.depthBuffer;
		draw->stencilBuffer = attachments.stencilBuffer;

		if(draw->depthBuffer)
		{
			data->depthBuffer = (float *)attachments.depthBuffer->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, data->viewID);
			data->depthPitchB = attachments.depthBuffer->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
			data->depthSliceB = attachments.depthBuffer->slicePitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
		}

		if(draw->stencilBuffer)
		{
			data->stencilBuffer = (unsigned char *)attachments.stencilBuffer->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, data->viewID);
			data->stencilPitchB = attachments.stencilBuffer->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
			data->stencilSliceB = attachments.stencilBuffer->slicePitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
		}
	}

	// Scissor
	{
		const VkRect2D &scissor = pipelineState.getScissor();

		// The scissor rectangle is clamped to the framebuffer extent.
		data->scissorX0 = clamp<int>(scissor.offset.x, 0, framebufferExtent.width);
		data->scissorX1 = clamp<int>(scissor.offset.x + scissor.extent.width, 0, framebufferExtent.width);
		data->scissorY0 = clamp<int>(scissor.offset.y, 0, framebufferExtent.height);
		data->scissorY1 = clamp<int>(scissor.offset.y + scissor.extent.height, 0, framebufferExtent.height);
	}

	// Push constants
	{
		data->pushConstants = pushConstants;
	}

	draw->events = events;

	vk::DescriptorSet::PrepareForSampling(draw->descriptorSetObjects, draw->pipelineLayout, device);

	// Split the draw into batches and schedule them.
	DrawCall::run(device, draw, &drawTickets, clusterQueues);
}

// Called once at the start of DrawCall::run(): starts the occlusion query (if
// one is attached) and registers this draw with the events object (if any).
void DrawCall::setup()
{
	if(occlusionQuery) { occlusionQuery->start(); }

	if(events) { events->add(); }
}

// Counterpart of setup(), run when the draw has completed:
//  - signals and detaches the events object,
//  - adds the per-cluster occlusion counters to the occlusion query and finishes it,
//  - drops the references to the three routines,
//  - marks every bound color target as changed, and
//  - notifies the descriptor sets if the pipeline contained image writes.
void DrawCall::teardown(vk::Device *device)
{
	if(events)
	{
		events->done();
		events = nullptr;
	}

	if(occlusionQuery)
	{
		for(int c = 0; c < MaxClusterCount; c++)
		{
			occlusionQuery->add(data->occlusion[c]);
		}

		occlusionQuery->finish();
	}

	vertexRoutine = {};
	setupRoutine = {};
	pixelRoutine = {};

	for(auto *renderTarget : colorBuffer)
	{
		if(!renderTarget)
		{
			continue;
		}

		renderTarget->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
	}

	if(containsImageWrite)
	{
		vk::DescriptorSet::ContentsChanged(descriptorSetObjects, pipelineLayout, device);
	}
}

// Splits `draw` into batches and schedules one marl task per batch.
//
// `tickets` is the per-renderer draw ticket queue; one ticket is taken for the
// whole draw and marked done after teardown. `clusterQueues` holds one ticket
// queue per cluster; every batch takes one ticket from each of them before its
// task is scheduled.
void DrawCall::run(vk::Device *device, const marl::Loan<DrawCall> &draw, marl::Ticket::Queue *tickets, marl::Ticket::Queue clusterQueues[MaxClusterCount])
{
	draw->setup();

	auto const numPrimitives = draw->numPrimitives;
	auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
	auto const numBatches = draw->numBatches;

	auto ticket = tickets->take();
	// `finally` is copied into every batch task (and from there into the pixel
	// tasks). Its callback tears the draw down and completes the draw ticket;
	// presumably it runs once the last copy is released -- see
	// marl::make_shared_finally for the exact semantics.
	auto finally = marl::make_shared_finally([device, draw, ticket] {
		MARL_SCOPED_EVENT("FINISH draw %d", draw->id);
		draw->teardown(device);
		ticket.done();
	});

	for(unsigned int batchId = 0; batchId < numBatches; batchId++)
	{
		auto batch = draw->batchDataPool->borrow();
		batch->id = batchId;
		batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
		// The last batch may contain fewer than numPrimitivesPerBatch primitives.
		batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;

		// Tickets are taken here, in batch order, before the task is scheduled.
		for(int cluster = 0; cluster < MaxClusterCount; cluster++)
		{
			batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
		}

		marl::schedule([device, draw, batch, finally] {
			processVertices(device, draw.get(), batch.get());

			if(!draw->setupState.rasterizerDiscard)
			{
				processPrimitives(device, draw.get(), batch.get());

				if(batch->numVisible > 0)
				{
					// processPixels() marks each cluster ticket done itself.
					processPixels(device, draw, batch, finally);
					return;
				}
			}

			// Nothing to rasterize for this batch: release its cluster tickets.
			for(int cluster = 0; cluster < MaxClusterCount; cluster++)
			{
				batch->clusterTickets[cluster].done();
			}
		});
	}
}

// Vertex stage for one batch: builds the per-primitive vertex index table for the
// batch, prepares the batch's vertex task and invokes the draw's vertex routine,
// which writes into the batch's triangle array.
void DrawCall::processVertices(vk::Device *device, DrawCall *draw, BatchData *batch)
{
	MARL_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);

	unsigned int triangleIndices[MaxBatchSize + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
	{
		MARL_SCOPED_EVENT("processPrimitiveVertices");
		processPrimitiveVertices(triangleIndices, draw->data->indices, draw->indexType,
		                         batch->firstPrimitive, batch->numPrimitives,
		                         draw->topology, draw->provokingVertexMode);
	}

	auto &task = batch->vertexTask;
	task.primitiveStart = batch->firstPrimitive;

	// We're only using batch compaction for points, not lines
	task.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);

	// The vertex cache is only valid within a single draw call.
	auto &cache = task.vertexCache;
	if(cache.drawCall != draw->id)
	{
		cache.clear();
		cache.drawCall = draw->id;
	}

	draw->vertexRoutine(device, &batch->triangles.front().v0, &triangleIndices[0][0], &task, draw->data);
}

// Primitive setup stage for one batch: runs the draw's setup function over the
// batch's triangles and records how many primitives it reported as visible.
void DrawCall::processPrimitives(vk::Device *device, DrawCall *draw, BatchData *batch)
{
	MARL_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);

	auto firstTriangle = &batch->triangles[0];
	auto firstPrimitive = &batch->primitives[0];

	const auto visibleCount = draw->setupPrimitives(device, firstTriangle, firstPrimitive, draw, batch->numPrimitives);
	batch->numVisible = visibleCount;
}

// Pixel stage for one batch: registers one callback per cluster on the batch's
// cluster tickets. Each callback runs the draw's pixel routine for its cluster
// and then marks that cluster ticket done.
void DrawCall::processPixels(vk::Device *device, const marl::Loan<DrawCall> &draw, const marl::Loan<BatchData> &batch, const std::shared_ptr<marl::Finally> &finally)
{
	// Bundles the draw loan, the batch loan and the `finally` handle so that all
	// cluster callbacks share a single reference-counted copy of them.
	struct Data
	{
		Data(const marl::Loan<DrawCall> &draw, const marl::Loan<BatchData> &batch, const std::shared_ptr<marl::Finally> &finally)
		    : draw(draw)
		    , batch(batch)
		    , finally(finally)
		{}
		marl::Loan<DrawCall> draw;
		marl::Loan<BatchData> batch;
		std::shared_ptr<marl::Finally> finally;
	};
	auto data = std::make_shared<Data>(draw, batch, finally);
	for(int cluster = 0; cluster < MaxClusterCount; cluster++)
	{
		// The callback is invoked when the ticket is called; presumably that is
		// once the preceding ticket in the same cluster queue is done -- see
		// marl::Ticket for the exact semantics.
		batch->clusterTickets[cluster].onCall([device, data, cluster] {
			auto &draw = data->draw;
			auto &batch = data->batch;
			MARL_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
			draw->pixelRoutine(device, &batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
			batch->clusterTickets[cluster].done();
		});
	}
}

// Takes a ticket from the draw ticket queue and waits on it, then refreshes the
// device's sampling routine snapshot cache before releasing the ticket. Draws
// take their tickets from the same queue, so this presumably waits for all
// previously submitted draws -- the ordering guarantee is marl::Ticket::Queue's.
void Renderer::synchronize()
{
	MARL_SCOPED_EVENT("synchronize");
	auto ticket = drawTickets.take();
	ticket.wait();
	device->updateSamplingRoutineSnapshotCache();
	ticket.done();
}

// Fills `triangleIndicesOut` with the vertex indices of the primitives
// [start, start + triangleCount) of a draw.
//
// - `primitiveIndices` is the index buffer, interpreted according to
//   `indexType` (16-bit or 32-bit); when it is null the draw is non-indexed and
//   vertex i simply has index i.
// - For every topology except point lists, one extra row is written after the
//   last primitive, repeating its last index, to allow for SIMD width overrun.
//   Point lists get their padding from setBatchIndices().
// - An unsupported index type asserts and leaves the output untouched, as does
//   an unsupported topology (reported by setBatchIndices()).
void DrawCall::processPrimitiveVertices(
    unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
    const void *primitiveIndices,
    VkIndexType indexType,
    unsigned int start,
    unsigned int triangleCount,
    VkPrimitiveTopology topology,
    VkProvokingVertexModeEXT provokingVertexMode)
{
	bool indicesSet = false;

	if(!primitiveIndices)
	{
		// Identity mapping used for non-indexed draws.
		struct LinearIndex
		{
			unsigned int operator[](unsigned int i) { return i; }
		};

		indicesSet = setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, LinearIndex(), start, triangleCount);
	}
	else
	{
		switch(indexType)
		{
		case VK_INDEX_TYPE_UINT16:
			indicesSet = setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint16_t *>(primitiveIndices), start, triangleCount);
			break;
		case VK_INDEX_TYPE_UINT32:
			indicesSet = setBatchIndices(triangleIndicesOut, topology, provokingVertexMode, static_cast<const uint32_t *>(primitiveIndices), start, triangleCount);
			break;
		default:
			ASSERT(false);
			return;
		}
	}

	if(!indicesSet)
	{
		return;
	}

	// setBatchIndices() takes care of the point case, since it's different due to the compaction.
	// With no primitives there is no last row to repeat (triangleCount - 1 would wrap around).
	if(topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST && triangleCount > 0)
	{
		// Repeat the last index to allow for SIMD width overrun.
		const unsigned int lastIndex = triangleIndicesOut[triangleCount - 1][2];
		triangleIndicesOut[triangleCount][0] = lastIndex;
		triangleIndicesOut[triangleCount][1] = lastIndex;
		triangleIndicesOut[triangleCount][2] = lastIndex;
	}
}

// Setup for filled triangles. Each of the `count` triangles is skipped if all
// three cull masks are zero or if the vertices' combined (AND-ed) clip flags
// differ from Clipper::CLIP_FINITE; it is clipped when any vertex carries other
// clip flags; the remainder is handed to the setup routine. Every accepted
// triangle advances the primitive output by the multisample count.
//
// Returns the number of triangles accepted by the setup routine.
int DrawCall::setupSolidTriangles(vk::Device *device, Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
	const DrawData *drawData = drawCall->data;
	const int sampleCount = drawCall->setupState.multiSampleCount;

	int numVisible = 0;

	for(int t = 0; t < count; t++)
	{
		Triangle &triangle = triangles[t];
		Vertex &a = triangle.v0;
		Vertex &b = triangle.v1;
		Vertex &c = triangle.v2;

		Polygon polygon(&a.position, &b.position, &c.position);

		// Trivial rejection: culled by all planes, or outside/non-finite for all vertices.
		if((a.cullMask | b.cullMask | c.cullMask) == 0 ||
		   (a.clipFlags & b.clipFlags & c.clipFlags) != Clipper::CLIP_FINITE)
		{
			continue;
		}

		const int clipFlagsOr = a.clipFlags | b.clipFlags | c.clipFlags;
		const bool needsClipping = (clipFlagsOr != Clipper::CLIP_FINITE);
		if(needsClipping && !Clipper::Clip(polygon, clipFlagsOr, *drawCall))
		{
			continue;
		}

		if(drawCall->setupRoutine(device, primitives, &triangle, &polygon, drawData))
		{
			primitives += sampleCount;
			numVisible++;
		}
	}

	return numVisible;
}

// Setup for triangles drawn in line polygon mode. The facing of each triangle is
// derived from its projected area (with the sign flipped when the XOR of the
// three w sign bits is set); triangles removed by the cull mode are skipped, and
// the three edges of the remaining ones are set up as individual lines.
//
// Returns the number of edges accepted by setupLine().
int DrawCall::setupWireframeTriangles(vk::Device *device, Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
	const auto &setup = drawCall->setupState;
	const int sampleCount = setup.multiSampleCount;

	int numVisible = 0;

	for(int t = 0; t < count; t++)
	{
		const Vertex &v0 = triangles[t].v0;
		const Vertex &v1 = triangles[t].v1;
		const Vertex &v2 = triangles[t].v2;

		float A = ((float)v0.projected.y - (float)v2.projected.y) * (float)v1.projected.x +
		          ((float)v2.projected.y - (float)v1.projected.y) * (float)v0.projected.x +
		          ((float)v1.projected.y - (float)v0.projected.y) * (float)v2.projected.x;  // Area

		// The sign bit of the XOR is set when an odd number of w values are negative.
		const int wSigns = bit_cast<int>(v0.w) ^
		                   bit_cast<int>(v1.w) ^
		                   bit_cast<int>(v2.w);
		if(wSigns < 0)
		{
			A = -A;
		}

		bool frontFacing;
		if(setup.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE)
		{
			frontFacing = (A >= 0.0f);
		}
		else
		{
			frontFacing = (A <= 0.0f);
		}

		if((setup.cullMode & VK_CULL_MODE_FRONT_BIT) && frontFacing)
		{
			continue;
		}
		if((setup.cullMode & VK_CULL_MODE_BACK_BIT) && !frontFacing)
		{
			continue;
		}

		// One temporary "triangle" per edge; setupLine() only reads v0 and v1.
		Triangle lines[3];
		lines[0].v0 = v0;
		lines[0].v1 = v1;
		lines[1].v0 = v1;
		lines[1].v1 = v2;
		lines[2].v0 = v2;
		lines[2].v1 = v0;

		for(int edge = 0; edge < 3; edge++)
		{
			if(setupLine(device, *primitives, lines[edge], *drawCall))
			{
				primitives += sampleCount;
				numVisible++;
			}
		}
	}

	return numVisible;
}

// Setup for triangles drawn in point polygon mode. The facing of each triangle
// is taken from the sign of a determinant of its vertices' x, y and w
// components; triangles removed by the cull mode are skipped, and the three
// vertices of the remaining ones are set up as individual points.
//
// Returns the number of points accepted by setupPoint().
int DrawCall::setupPointTriangles(vk::Device *device, Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
	const auto &setup = drawCall->setupState;
	const int sampleCount = setup.multiSampleCount;

	int numVisible = 0;

	for(int t = 0; t < count; t++)
	{
		const Vertex &v0 = triangles[t].v0;
		const Vertex &v1 = triangles[t].v1;
		const Vertex &v2 = triangles[t].v2;

		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
		          (v2.x * v1.y - v1.x * v2.y) * v0.w;

		bool frontFacing;
		if(setup.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE)
		{
			frontFacing = (d > 0);
		}
		else
		{
			frontFacing = (d < 0);
		}

		if((setup.cullMode & VK_CULL_MODE_FRONT_BIT) && frontFacing)
		{
			continue;
		}
		if((setup.cullMode & VK_CULL_MODE_BACK_BIT) && !frontFacing)
		{
			continue;
		}

		// One temporary "triangle" per vertex; setupPoint() only reads v0.
		Triangle points[3];
		points[0].v0 = v0;
		points[1].v0 = v1;
		points[2].v0 = v2;

		for(int corner = 0; corner < 3; corner++)
		{
			if(setupPoint(device, *primitives, points[corner], *drawCall))
			{
				primitives += sampleCount;
				numVisible++;
			}
		}
	}

	return numVisible;
}

// Setup for line topologies: runs setupLine() on each of the `count` entries and
// advances the primitive output by the multisample count for every accepted line.
//
// Returns the number of accepted lines.
int DrawCall::setupLines(vk::Device *device, Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
	const int sampleCount = drawCall->setupState.multiSampleCount;
	int numVisible = 0;

	for(int line = 0; line < count; line++)
	{
		if(!setupLine(device, *primitives, triangles[line], *drawCall))
		{
			continue;
		}

		primitives += sampleCount;
		numVisible++;
	}

	return numVisible;
}

// Setup for point topologies: runs setupPoint() on each of the `count` entries
// and advances the primitive output by the multisample count for every accepted
// point.
//
// Returns the number of accepted points.
int DrawCall::setupPoints(vk::Device *device, Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
	const int sampleCount = drawCall->setupState.multiSampleCount;
	int numVisible = 0;

	for(int point = 0; point < count; point++)
	{
		if(!setupPoint(device, *primitives, triangles[point], *drawCall))
		{
			continue;
		}

		primitives += sampleCount;
		numVisible++;
	}

	return numVisible;
}

// Sets up a single line segment, given as the v0/v1 vertices of `triangle`, by
// turning it into a four-vertex polygon and passing that to the setup routine.
//
// Two shapes are used, selected by the draw's line rasterization mode:
//  - any mode other than Bresenham: a rectangle of the configured line width,
//    centered on the segment;
//  - Bresenham: a parallelogram approximating a Bresenham line.
//
// Returns false when the line is rejected: both cull masks are zero, both end
// points have w <= 0, the segment has zero length on screen, the polygon is
// trivially outside (combined clip flags differ from Clipper::CLIP_FINITE), or
// clipping removes it entirely. Otherwise returns the setup routine's result.
bool DrawCall::setupLine(vk::Device *device, Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{
	const DrawData &data = *draw.data;

	float lineWidth = data.lineWidth;

	Vertex &v0 = triangle.v0;
	Vertex &v1 = triangle.v1;

	if((v0.cullMask | v1.cullMask) == 0)
	{
		return false;
	}

	const float4 &P0 = v0.position;
	const float4 &P1 = v1.position;

	if(P0.w <= 0 && P1.w <= 0)
	{
		return false;
	}

	constexpr float subPixF = vk::SUBPIXEL_PRECISION_FACTOR;

	// Recover the viewport half extents from their subpixel-scaled copies.
	const float W = data.WxF[0] * (1.0f / subPixF);
	const float H = data.HxF[0] * (1.0f / subPixF);

	// Direction of the segment after the perspective divide, scaled by the
	// viewport half extents.
	float dx = W * (P1.x / P1.w - P0.x / P0.w);
	float dy = H * (P1.y / P1.w - P0.y / P0.w);

	if(dx == 0 && dy == 0)
	{
		return false;
	}

	if(draw.lineRasterizationMode != VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT)
	{
		// Rectangle centered on the line segment

		float4 P[4];
		int C[4];

		P[0] = P0;
		P[1] = P1;
		P[2] = P1;
		P[3] = P0;

		// Normalize the direction to half the line width.
		float scale = lineWidth * 0.5f / sqrt(dx * dx + dy * dy);

		dx *= scale;
		dy *= scale;

		// Perpendicular offsets, scaled by each end point's w.
		float dx0h = dx * P0.w / H;
		float dy0w = dy * P0.w / W;

		float dx1h = dx * P1.w / H;
		float dy1w = dy * P1.w / W;

		P[0].x += -dy0w;
		P[0].y += +dx0h;
		C[0] = Clipper::ComputeClipFlags(P[0], draw.depthClipEnable);

		P[1].x += -dy1w;
		P[1].y += +dx1h;
		C[1] = Clipper::ComputeClipFlags(P[1], draw.depthClipEnable);

		P[2].x += +dy1w;
		P[2].y += -dx1h;
		C[2] = Clipper::ComputeClipFlags(P[2], draw.depthClipEnable);

		P[3].x += +dy0w;
		P[3].y += -dx0h;
		C[3] = Clipper::ComputeClipFlags(P[3], draw.depthClipEnable);

		if((C[0] & C[1] & C[2] & C[3]) == Clipper::CLIP_FINITE)
		{
			Polygon polygon(P, 4);

			int clipFlagsOr = C[0] | C[1] | C[2] | C[3];

			if(clipFlagsOr != Clipper::CLIP_FINITE)
			{
				if(!Clipper::Clip(polygon, clipFlagsOr, draw))
				{
					return false;
				}
			}

			return draw.setupRoutine(device, &primitive, &triangle, &polygon, &data);
		}
	}
	else
	{
		// Parallelogram approximating Bresenham line
		// This algorithm does not satisfy the ideal diamond-exit rule, but does avoid the
		// duplicate fragment rasterization problem and satisfies all of Vulkan's minimum
		// requirements for Bresenham line segment rasterization.

		// P[0..3] are the left/top/right/bottom corners of a diamond around P0,
		// P[4..7] the same corners around P1.
		float4 P[8];
		P[0] = P0;
		P[1] = P0;
		P[2] = P0;
		P[3] = P0;
		P[4] = P1;
		P[5] = P1;
		P[6] = P1;
		P[7] = P1;

		float dx0 = lineWidth * 0.5f * P0.w / W;
		float dy0 = lineWidth * 0.5f * P0.w / H;

		float dx1 = lineWidth * 0.5f * P1.w / W;
		float dy1 = lineWidth * 0.5f * P1.w / H;

		P[0].x += -dx0;
		P[1].y += +dy0;
		P[2].x += +dx0;
		P[3].y += -dy0;
		P[4].x += -dx1;
		P[5].y += +dy1;
		P[6].x += +dx1;
		P[7].y += -dy1;

		// Pick the two opposing corners of each diamond according to the major
		// direction of the line.
		float4 L[4];

		if(dx > -dy)
		{
			if(dx > dy)  // Right
			{
				L[0] = P[1];
				L[1] = P[5];
				L[2] = P[7];
				L[3] = P[3];
			}
			else  // Down
			{
				L[0] = P[0];
				L[1] = P[4];
				L[2] = P[6];
				L[3] = P[2];
			}
		}
		else
		{
			if(dx > dy)  // Up
			{
				L[0] = P[0];
				L[1] = P[2];
				L[2] = P[6];
				L[3] = P[4];
			}
			else  // Left
			{
				L[0] = P[1];
				L[1] = P[3];
				L[2] = P[7];
				L[3] = P[5];
			}
		}

		int C0 = Clipper::ComputeClipFlags(L[0], draw.depthClipEnable);
		int C1 = Clipper::ComputeClipFlags(L[1], draw.depthClipEnable);
		int C2 = Clipper::ComputeClipFlags(L[2], draw.depthClipEnable);
		int C3 = Clipper::ComputeClipFlags(L[3], draw.depthClipEnable);

		if((C0 & C1 & C2 & C3) == Clipper::CLIP_FINITE)
		{
			Polygon polygon(L, 4);

			int clipFlagsOr = C0 | C1 | C2 | C3;

			if(clipFlagsOr != Clipper::CLIP_FINITE)
			{
				if(!Clipper::Clip(polygon, clipFlagsOr, draw))
				{
					return false;
				}
			}

			return draw.setupRoutine(device, &primitive, &triangle, &polygon, &data);
		}
	}

	return false;
}

// Sets up a single point, given as the v0 vertex of `triangle`, as a screen-
// aligned quad around the vertex position. The point size is clamped to
// [1, vk::MAX_POINT_SIZE] and its reciprocal is stored in the primitive.
//
// Returns false when the vertex's cull mask is zero, when the quad is trivially
// outside (combined clip flags differ from Clipper::CLIP_FINITE), or when
// clipping removes it entirely. Otherwise returns the setup routine's result.
bool DrawCall::setupPoint(vk::Device *device, Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{
	const DrawData &data = *draw.data;
	Vertex &v = triangle.v0;

	if(v.cullMask == 0)
	{
		return false;
	}

	float pSize = v.pointSize;
	pSize = clamp(pSize, 1.0f, static_cast<float>(vk::MAX_POINT_SIZE));

	// All four corners start at the vertex position.
	float4 P[4];
	for(float4 &corner : P)
	{
		corner = v.position;
	}

	// Half extents of the quad, scaled by the vertex's w.
	const float X = pSize * P[0].w * data.halfPixelX[0];
	const float Y = pSize * P[0].w * data.halfPixelY[0];

	P[0].x -= X;
	P[0].y += Y;

	P[1].x += X;
	P[1].y += Y;

	P[2].x += X;
	P[2].y -= Y;

	P[3].x -= X;
	P[3].y -= Y;

	int C[4];
	for(int corner = 0; corner < 4; corner++)
	{
		C[corner] = Clipper::ComputeClipFlags(P[corner], draw.depthClipEnable);
	}

	Polygon polygon(P, 4);

	if((C[0] & C[1] & C[2] & C[3]) != Clipper::CLIP_FINITE)
	{
		return false;
	}

	const int clipFlagsOr = C[0] | C[1] | C[2] | C[3];
	if(clipFlagsOr != Clipper::CLIP_FINITE && !Clipper::Clip(polygon, clipFlagsOr, draw))
	{
		return false;
	}

	primitive.pointSizeInv = 1.0f / pSize;

	return draw.setupRoutine(device, &primitive, &triangle, &polygon, &data);
}

// Attaches an occlusion query to the renderer. Only occlusion queries are
// accepted, and only one can be attached at a time (both are asserted). The
// query is copied onto each subsequent draw call in Renderer::draw().
void Renderer::addQuery(vk::Query *query)
{
	ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
	ASSERT(!occlusionQuery);

	occlusionQuery = query;
}

// Detaches the currently attached occlusion query. `query` must be an occlusion
// query and must be the one that is attached (both are asserted).
void Renderer::removeQuery(vk::Query *query)
{
	ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
	ASSERT(occlusionQuery == query);

	occlusionQuery = nullptr;
}

}  // namespace sw
